An influential book by Leland Wilkinson with this title lays out a theoretical framework to systematically construct quantitative graphics. Hadley Wickham extended these and related ideas and implemented them in the ggplot2 package. If you are interested in more details, this paper would be a great starting point: http://vita.had.co.nz/papers/layered-grammar.pdf.
Note that these ideas deal only with static graphics.
At a very high level, this is what you would do with ggplot:
ggplot() callgeom_*() or stat_*() calls. Some properties (e.g. position) of these layers can be set in these calls.aes() for the full plot or per-layer. This deals with selecting variables to plot, colours, sizes, shapes of the plot elements etc. Aesthetics can be “mapped” to variables in the data.Resources:
For a quick overview of ggplot, see https://r4ds.had.co.nz/data-visualisation.html.
To learn and practice more, the Rmd files here are great: https://github.com/hadley/ggplot2-book
We load the data saved by the tidy.Rmd file (part-2 of the tutorial).
library(tidyverse)
data <- read_csv("data/ua_populations.csv")
sex_ratio_change <- read_csv("data/sex_ratio_change.csv")
Create a tidy population data set
pop_data <- data %>%
rename(total = population, male = pop_male, female = pop_female) %>%
select(-c(area, pop_change, pop_change_percent)) %>%
gather(sex, population, total, male, female)
pop_data
## # A tibble: 8,331 x 5
## ua_no ua year sex population
## <dbl> <chr> <dbl> <chr> <dbl>
## 1 1 Greater Mumbai U.A. 1961 total 4515495
## 2 1 Greater Mumbai U.A. 1971 total 6596370
## 3 1 Greater Mumbai U.A. 1981 total 9421962
## 4 1 Greater Mumbai U.A. 1991 total 12596243
## 5 1 Greater Mumbai U.A. 2001 total 16434386
## 6 1 Greater Mumbai U.A. 2011 total 18394912
## 7 2 Delhi U.A. 1961 total 2359408
## 8 2 Delhi U.A. 1971 total 3647023
## 9 2 Delhi U.A. 1981 total 5760811
## 10 2 Delhi U.A. 1991 total 8471625
## # ... with 8,321 more rows
pop_data %>%
filter(sex != "total" & year == 1961) %>%
ggplot(aes(x = log(population), fill = sex, alpha = 0.5)) +
geom_density(position = "identity")
Try changing geom_density() to geom_histogram(). Also try the position = "dodge" option. You could also add facet_wrap() to get the distributions for the years separately.
library(ggridges)
pop_data %>%
filter(sex == "total") %>%
ggplot(aes(x = log10(population),
y = factor(year, levels = rev(levels(factor(year)))))) +
stat_density_ridges()
pop_data %>%
filter(ua == "Vapi ^" & sex == "total") %>%
ggplot(aes(x = year, y = population)) +
geom_line()
gap_increasing <- sex_ratio_change %>%
top_n(20, sex_ratio_slope)
gap_decreasing <- sex_ratio_change %>%
top_n(-20, sex_ratio_slope)
p <- pop_data %>%
semi_join(gap_decreasing, by = "ua_no") %>%
filter(sex == "total") %>%
ggplot(aes(x = year, y = population)) +
geom_line() +
facet_wrap(~ua, scales = "free_y")
p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
Disparity closing
pop_data %>%
semi_join(gap_decreasing, by = "ua_no") %>%
filter(sex != "total") %>%
ggplot() +
geom_line(aes(x = year, y = population, colour = sex)) +
facet_wrap(~ua, scales = "free_y", ncol = 4, strip.position = "bottom") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Disparity increasing
pop_data %>%
semi_join(gap_increasing, by = "ua_no") %>%
filter(sex != "total") %>%
ggplot() +
geom_line(aes(x = year, y = population, colour = sex)) +
facet_wrap(~ua, scales = "free_y", ncol = 4, strip.position = "bottom") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pop_data %>%
filter(year == 2011 & sex == "total") %>%
arrange(desc(population)) %>%
top_n(30) %>%
ggplot() +
geom_bar(aes(x = reorder(factor(ua), population), y = population), stat = "identity") +
coord_flip() +
xlab("City") +
scale_y_continuous(label = scales::comma)
cities_scatter <- data %>%
filter(year == 2011) %>%
mutate(
sex_ratio = pop_male / pop_female,
pop_density = population / area
) %>%
ggplot(aes(x = log10(population), y = pop_density, label = ua)) +
geom_point(aes(size = log10(population),
colour = sex_ratio,
alpha = 0.4)) +
scale_color_gradient(low = "green", high = "red") +
scale_y_reverse()
cities_scatter
library(ggrepel)
cities_scatter_labelled <- cities_scatter +
geom_label_repel(aes(label = ifelse(sex_ratio > quantile(sex_ratio, 0.98, na.rm = TRUE) |
sex_ratio < quantile(sex_ratio, 0.02, na.rm = TRUE) |
pop_density > quantile(pop_density, 0.98, na.rm = TRUE) |
pop_density < quantile(pop_density, 0.02, na.rm = TRUE),
ua, '')))
cities_scatter_labelled
plotly::ggplotly(cities_scatter)
Let’s try something fancy!
#library(animation)
mega_cities <- data %>%
filter(year == 2011 & population > 3000000 & ua_no != 6) %>%
select(ua_no)
draw_scatter <- function(y) {
p <- data %>%
filter(year == y) %>%
semi_join(mega_cities, by = "ua_no") %>%
mutate(
sex_ratio = pop_male / pop_female,
pop_density = population / area
) %>%
ggplot(aes(x = sex_ratio, y = pop_density, label = ua)) +
geom_point(aes(size = sqrt(population) / 300,
colour = ua,
alpha = 0.5)) +
xlim(.8, 1.8) +
ylim(25000, 0) +
guides(size = FALSE, alpha = FALSE) +
scale_size_identity()
ggsave(paste0("images/anim_", y, ".png"), plot = p)
p
}
#saveGIF(c(1961, 1971, 1981, 1991, 2001, 2011) %>% map(get_scatter),
# interval = .2,
# movie.name = "megacities_evolution.gif")
scatter_plots <- c(1961, 1971, 1981, 1991, 2001, 2011) %>% map(draw_scatter)
shell("magick convert -delay 100 -loop 1 images/anim*.png images/megacities_evolution.gif")
{width = 200px}